from termcolor import colored
from sklearn.tree import DecisionTreeClassifier
import missingno as msno
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pickle
import pprint
from sklearn.ensemble import RandomForestRegressor
from pandas_profiling import ProfileReport
from dateutil import relativedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.regression.linear_model import OLS
# Master switch read by the plotting helpers below: set it to False to skip
# every per-column plot they would otherwise draw.
plot_______ = True
def new_line():
    """Print a dashed separator with an empty line before and after it."""
    separator = "\n-------------------------\n"
    print(separator)
def RMSE(predictions, actual=None):
    """Return the root-mean-squared error of *predictions*, rounded to an integer.

    Parameters
    ----------
    predictions : array-like
        Predicted values.
    actual : array-like, optional
        True values to compare against. When omitted, the module-level
        ``test_y`` is used, which is what existing one-argument callers rely on.
    """
    if actual is None:
        actual = test_y
    squared_errors = (actual - predictions) ** 2
    return round(np.sqrt(squared_errors.mean()))
def plot_numerical_columns(col_name):
    """Show four diagnostic plots for the numeric column ``df[col_name]``.

    The plots are: a histogram, the values in index order, the values sorted
    ascending, and a box plot. The mean (red) and median (green) are marked
    on the first three. Returns None without plotting when the module-level
    switch ``plot_______`` is falsy.
    """
    if not plot_______:
        return None

    values = df[col_name]
    mean_value = values.mean()
    median_value = values.median()

    # Histogram. The x axis carries the column's values here, so the mean and
    # median have to be vertical lines; a horizontal line at y=mean would be
    # placed on the frequency axis. Labels are attached to the artists
    # themselves so the legend cannot pair a label with the wrong artist.
    values.plot(kind="hist", figsize=(13, 8), label='Actual')
    plt.title(col_name, size=18)
    plt.axvline(x=mean_value, color='red', label='Mean')
    plt.axvline(x=median_value, color='green', label='Median')
    plt.legend()
    plt.show()

    # Line plot of the values in their original index order.
    values.plot(figsize=(13, 8))
    plt.title(col_name, size=18)
    plt.axhline(y=mean_value, color='red')
    plt.axhline(y=median_value, color='green')
    plt.legend(['Actual', 'Mean', 'Median'])
    plt.show()

    # Line plot of the values sorted ascending, against their rank.
    values.sort_values().reset_index(drop=True).plot(figsize=(13, 8))
    plt.title(col_name + " (SORTED)", size=18)
    plt.axhline(y=mean_value, color='red')
    plt.axhline(y=median_value, color='green')
    plt.legend(['Actual', 'Mean', 'Median'])
    plt.show()

    # Box plot.
    values.plot(kind="box", figsize=(13, 8))
    plt.title(col_name, size=18)
    plt.xlabel("")
    plt.show()
def plot_date_columns(col_name):
    """Show five plots for the datetime column ``df[col_name]``.

    In order: the dates against the index, the dates sorted ascending, and
    bar charts of the percentage of rows per year, per month and per day of
    month. Returns None without plotting when ``plot_______`` is falsy.
    """
    if not plot_______:
        return None

    def finish(x_label, y_label, title):
        # Label the current figure and display it.
        plt.xlabel(x_label, size=14)
        plt.ylabel(y_label, size=14)
        plt.title(title, size=18)
        plt.show()

    dates = df[col_name]
    n_rows = len(df)

    dates.plot(figsize=(15, 7), grid=True)
    finish("Index", "Date", col_name + " Graph")

    dates.sort_values().reset_index(drop=True).plot(figsize=(15, 7), grid=True)
    finish("Index (sorted)", "Year", col_name + " Graph")

    year_share = dates.dt.year.value_counts(sort=False).sort_index() / n_rows * 100
    year_share.plot(kind="bar", figsize=(15, 7), grid=True)
    finish("Year", "Ratio (1-100)", col_name + " year Frequency Graph")

    month_share = dates.dt.month.value_counts().sort_index() / n_rows * 100
    month_share.plot(kind="bar", figsize=(15, 7), grid=True)
    finish("Month", "Ratio (1-100)", col_name + " month Frequency Graph")

    day_share = dates.dt.day.value_counts().sort_index() / n_rows * 100
    day_share.plot(kind="bar", figsize=(15, 7), grid=True)
    finish("Day", "Ratio (1-100)", col_name + " Day Frequency Graph")
def plot_catagorical_columns(cat_variable):
    """Show a bar chart of the percentage of rows in each category of
    ``df[cat_variable]``.

    Returns None without plotting when ``plot_______`` is falsy.
    """
    if not plot_______:
        return None

    category_share = df[cat_variable].value_counts() / len(df) * 100
    category_share.plot.bar(figsize=(15, 6), grid=True)
    plt.title(cat_variable, size=18, color='r')
    plt.xlabel("Catagory", size=14, color='r')
    plt.ylabel("Ratio (1-100)", size=14, color='r')
    plt.show()
def data_shape():
    """Return a short text stating the row and column counts of ``df``."""
    n_rows = df.shape[0]
    n_columns = df.shape[1]
    return f"The Data have:\n\t{n_rows} rows\n\t{n_columns} columns\n"
#===
# df = pd.read_csv("data.csv", date_parser=True)
# df = pd.read_csv("df_only_selected_columns_using_PCA.csv", date_parser=True)
# target_variable = "ACTUAL_WORTH"
# df = pd.concat([
# df.select_dtypes("number").iloc[:, :3],
# df.select_dtypes("O").iloc[:, :3],
# df.select_dtypes(exclude=["number", "O"]),
# df[[target_variable]]], 1)
# target_variable = "AREA_NAME_EN"
# df = pd.read_csv("cleaned_data.csv", date_parser=True)
# target_variable = "SalePrice"
# Name of the column to predict; it is present in train.csv only.
target_variable = "SalePrice"

train = pd.read_csv("/home/amir/Downloads/train.csv")
test = pd.read_csv("/home/amir/Downloads/test.csv")

# Split the target off the training frame so both frames have the same
# columns, stack them, then re-attach the target (None for the test rows).
train_y = train[target_variable]
train = train.drop(columns=target_variable)
df = pd.concat([train, test])
test_targets = [None] * len(test)
df[target_variable] = train_y.to_list() + test_targets
#===
# Report the size of the data.
new_line()
print(data_shape())
#===
# Report, and plot, how many columns there are of each dtype.
new_line()
dtype_counts = df.dtypes.value_counts()
print(f"Columns types distribution:\n\n{dtype_counts}\n")
dtype_counts.plot(kind='barh', figsize=(10, 2), grid=True, title="Variable types Count Graph")
plt.xlabel("Count")
plt.show()
#===
# Rows without a target value cannot be used for training: report and drop them.
target_is_missing = df[target_variable].isna()
n_missing_targets = target_is_missing.sum()
if n_missing_targets:
    new_line()
    print(colored(f"There are {n_missing_targets} NAs in target values, we droped those rows", 'red'))
    df = df[~target_is_missing]
del target_is_missing, n_missing_targets
#---------------------------------------------------
# df.select_dtypes("O").columns[:5]
# D = df.select_dtypes(exclude="O")
# D2 = df.select_dtypes("O").iloc[:,:5]
# df = pd.concat([D, D2], 1)
# profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# profile.to_file("your_report.html")
#---------------------------------------- NA
# Number of missing values per column, restricted to columns that have any.
na_counts = df.isna().sum()
na_counts = na_counts.where(na_counts > 0).dropna()
if na_counts.size:
    new_line()
    na_column_share = round(len(na_counts) / df.shape[1] * 100)
    print(colored(
        f"There are {len(na_counts)} (out of {df.shape[1]}, [{na_column_share}%]) columns that contains 1 or more NA.",
        'red'))

    # Keep the "was this value missing?" information as an extra feature.
    for na_column in na_counts.index:
        df[na_column + "_NA_indicator"] = df[na_column].isna().replace({True: "Missing", False: "Not missing"})
    new_line()
    print(colored(f"{na_counts.size} NA_indicator variables added to the data\n", 'red'))

    print("========= NA Graphs =========\n")
    msno.matrix(df)
    plt.title("NA Graph")
    plt.show()
    new_line()
    sns.heatmap(df.isnull(), cbar=False)
    plt.title("NA Graph")
    plt.show()
#===
# Percentage of missing values per column, ascending.
na_ratio = na_counts.sort_values() / len(df) * 100
entirely_missing = na_ratio[na_ratio == 100].index
if len(entirely_missing):
    new_line()
    df.drop(columns=entirely_missing, inplace=True)
    print(colored(
        f"There are {len(entirely_missing)} columns that are all Missing values, so we droped those.\nNow {data_shape()}\n\nDropped columns names:",
        'red'))
    for dropped_column in entirely_missing:
        print("\t", dropped_column)
    na_ratio = na_ratio[na_ratio != 100]
#===
na_dtype_counts = df[na_ratio.index].dtypes.value_counts()
if na_dtype_counts.size:
    new_line()
    print(f"NA columns data type Distribution:\n\n{na_dtype_counts}")
del na_dtype_counts
#===
new_line()
if na_ratio.size:
    print(f"NaN Ratio (0-100)\n\n{na_ratio}")
else:
    print(colored("Now There is no NaN value in our Data", 'red'))
#===
# ----------------------------------------------- Imputing Missing values
# ------------------------------------ Numerical columns imputing
def _missing_value_breakdown():
    """Return the text listing how many values are missing in ``df``,
    split into object, numeric and other columns."""
    total = df.isna().sum().sum()
    in_categorical = df.select_dtypes("O").isna().sum().sum()
    in_numeric = df.select_dtypes("number").isna().sum().sum()
    in_others = df.select_dtypes(exclude=["O", "number"]).isna().sum().sum()
    return (f'There are {total} Missing values:\n\t{in_categorical} in catagorical variables'
            f'\n\t{in_numeric} in numerical columns\n\t{in_others} in others')


# Fill missing numeric values with a 4-nearest-neighbours imputer; nothing is
# done (or printed) when the numeric columns have no missing values.
if df.select_dtypes("number").isna().sum().sum():
    new_line()
    print('(Before Missing values treatment)\n' + _missing_value_breakdown())
    from sklearn.impute import KNNImputer

    non_numeric_part = df.select_dtypes(exclude="number")
    numeric_part = df.select_dtypes("number")
    del df

    knn_imputer = KNNImputer(n_neighbors=4, weights="uniform")
    filled_values = knn_imputer.fit_transform(numeric_part)
    numeric_part = pd.DataFrame(filled_values, columns=numeric_part.columns)

    # Both halves get a fresh 0..n-1 index so they line up row by row.
    df = pd.concat(
        [non_numeric_part.reset_index(drop=True), numeric_part.reset_index(drop=True)],
        axis=1,
    )
    del non_numeric_part
    del numeric_part
    print('\n(After filling numeric missing values)\n' + _missing_value_breakdown())
#===
# -------------------------------- Categorical variables imputation
# Object columns that still contain missing values, least-missing first, so
# that each column filled here can serve as a predictor for the next ones.
vars_to_fill = df.select_dtypes("O").isna().mean().where(lambda x: x > 0).dropna().sort_values(ascending=True)
if vars_to_fill.size:
    for col in vars_to_fill.index:
        # Predictors: every column that currently has no missing value.
        # Numeric columns pass through get_dummies unchanged and object
        # columns are one-hot encoded.
        complete_columns = df.loc[:, df.isna().sum() == 0]
        imputation_features = pd.get_dummies(complete_columns, prefix_sep="__")

        value_is_missing = df[col].isna()
        # Learn the column from the rows where it is known, then predict it
        # for the rows where it is missing.
        imputation_tree = DecisionTreeClassifier().fit(
            imputation_features[~value_is_missing],
            df.loc[~value_is_missing, col],
        )
        df.loc[value_is_missing, col] = imputation_tree.predict(imputation_features[value_is_missing])
    new_line()
    print(f"Missing values imputed, Now there are {df.isna().sum().sum()} Missing values")
# ----------------------------------------------- END Imputing Missing values
# --------------------------------------------------------- Unique values
# Columns holding a single distinct value carry no information: drop them.
unique_value_counts = df.nunique()
only_one_unique_value = unique_value_counts[unique_value_counts == 1].index
if len(only_one_unique_value):
    new_line()
    df.drop(columns=only_one_unique_value, inplace=True)
    last_ = ("", "it") if len(only_one_unique_value) == 1 else ("s", "those")
    to_print = f"There are {len(only_one_unique_value)} variable{last_[0]} That have only one unique value, so we droped {last_[1]}.\nDropped column{last_[0]} name{last_[0]} (in order):"
    print(colored(to_print, 'red'))
    for i in only_one_unique_value.sort_values():
        print(i)
    new_line()
    print(f"\nNow {data_shape()}")
del only_one_unique_value
# #===
# Columns in which no value repeats (ids and the like) are dropped as well.
# The target is never considered here: a continuous target can easily be
# all-unique, and dropping it would make the modeling section fail.
all_values_are_unique = [
    column for column in df.columns
    if column != target_variable and df[column].is_unique
]
if all_values_are_unique:
    new_line()
    df.drop(columns=all_values_are_unique, inplace=True)
    last_ = ("", "it") if len(all_values_are_unique) == 1 else ("s", "those")
    to_print = f"There are {len(all_values_are_unique)} column{last_[0]} that have all unique values, so no value repeatation, we droped {last_[1]} column{last_[0]}.\nDropped column{last_[0]} name{last_[0]} are:\n"
    print(colored(to_print, 'red'))
    for i in all_values_are_unique:
        print("\t", i)
    new_line()
    print(f"Now {data_shape()}")
del all_values_are_unique
#===
date_columns = []


def DTYPES():
    """Classify the columns of ``df`` and return the result as a DataFrame.

    Every object column that ``pd.to_datetime`` can parse is converted to
    datetime in ``df`` itself and classified as 'Date'; the remaining object
    columns are 'Object' and numeric columns are 'Number'. The names of the
    date columns are also stored in the module-level ``date_columns``.

    Returns
    -------
    pandas.DataFrame
        One row per classified column, with columns ``Column`` and ``dtype``.
    """
    global date_columns
    catagorical_columns = df.head().select_dtypes("O").columns
    numerical_columns = df.head().select_dtypes("number").columns

    detected_dates = []
    for i in catagorical_columns:
        try:
            converted = pd.to_datetime(df[i])
        except (ValueError, TypeError, OverflowError):
            # Not parseable as dates: the column stays a plain object column.
            continue
        df[i] = converted
        detected_dates.append(i)
    catagorical_columns = catagorical_columns.drop(detected_dates)
    # Same shape as before for callers: an Index when dates were found,
    # otherwise an empty list.
    date_columns = pd.Index(detected_dates) if detected_dates else []

    classified_columns = catagorical_columns.append(numerical_columns).append(pd.Index(detected_dates))
    dtypes = pd.DataFrame({
        "Column": classified_columns,
        "dtype": (['Object'] * len(catagorical_columns)
                  + ['Number'] * len(numerical_columns)
                  + ['Date'] * len(detected_dates)),
    })
    #===
    # Sanity check: no column may be classified under more than one type.
    if not classified_columns.is_unique:
        new_line()
        print(colored("Some column/s repated in > 1 dtypes\n", 'red'))
        print(dtypes[dtypes.Column.duplicated(keep=False)].to_string())
    #===
    # Sanity check: report columns that fit none of the three types.
    x = df.columns.difference(classified_columns)
    if x.size:
        new_line()
        print(colored("Some columns not included in any existing catagory, those:\n", 'red'))
        for i in x:
            print(f"\t<{i}, with dtype of <{df[i].dtype}>")
    #===
    return dtypes


#===
dtypes = DTYPES()
# ----------------------------------------------------------------------- Feature engineering
# ======= Adding date columns
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> add polynomial, sqrt, tree, log features
def add_new_date_cols(x, suffix):
    """Derive calendar features from the datetime Series *x* and add them to ``df``.

    Each feature is stored in ``df`` under a name starting with *suffix*
    (used as a prefix of the new column names), but only when it has more
    than one distinct value. Features ending in ``_str`` are wrapped in
    double quotes so they stay strings.

    Parameters
    ----------
    x : pandas.Series
        A datetime column of ``df``.
    suffix : str
        Text prepended to every new column name.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the original date column ``x.name``.
    """
    def quoted_number(numbers):
        # '"<n>"' for each number (a trailing ".0" removed), NaN where missing.
        as_text = numbers.apply(lambda v: np.nan if np.isnan(v) else str(v).replace(".0", ""))
        return '"' + as_text + '"'

    # Series.dt.week is deprecated and was removed in pandas 2.0; the ISO
    # calendar accessor is used where available. It is cast to float so that
    # missing dates become NaN, as they did with .dt.week.
    if hasattr(x.dt, "isocalendar"):
        week = x.dt.isocalendar().week.astype("float64")
    else:
        week = x.dt.week

    d = {}
    d[suffix + '_week_normalized'] = week / 52
    d[suffix + '_week_str'] = quoted_number(week)
    d[suffix + '_year_after_min_year'] = x.dt.year - x.dt.year.min()
    d[suffix + '_year_str'] = quoted_number(x.dt.year)
    d[suffix + '_day_name'] = x.dt.day_name()
    # Whole days elapsed since the earliest date, taken from the timedelta's text form.
    d[suffix + '_day_after_min_date_str'] = '"' + (x - x.min()).apply(lambda delta: str(delta).split()[0]) + '"'
    d[suffix + '_day_normalized'] = x.dt.day / 31
    d[suffix + '_hour_normalized'] = x.dt.hour / 24
    d[suffix + '_hour_str'] = quoted_number(x.dt.hour)
    d[suffix + '_month_name'] = x.dt.month_name()
    d[suffix + '_month_normalized'] = x.dt.month / 12

    for k, v in d.items():
        # A feature with a single distinct value carries no information.
        if v.nunique() > 1:
            df[k] = v
    return df.drop(columns=x.name)
# Replace every detected date column by its derived calendar features and
# report how many columns that added.
n_columns_before_date_features = df.shape[1]
for date_col in date_columns:
    df = add_new_date_cols(df[date_col], date_col)
n_date_features_added = df.shape[1] - n_columns_before_date_features
if n_date_features_added > 0:
    new_line()
    print(colored(f"Added {n_date_features_added} date Features", 'red'))
# ======= Numeric columns with few distinct values (fewer than 4% of the row
# count) also get a categorical copy: "<column>_str", the value as quoted text.
# The target is excluded, otherwise a low-cardinality numeric target would be
# copied into the feature set and leak into the models.
numeric_unique_ratio = df.select_dtypes("number").nunique() / len(df) * 100
low_cardinality_numeric = numeric_unique_ratio[numeric_unique_ratio < 4].index.drop(
    target_variable, errors="ignore")
if low_cardinality_numeric.size:
    n_columns_before_str_features = df.shape[1]
    for col_num_to_str in low_cardinality_numeric:
        df[col_num_to_str + "_str"] = '"' + df[col_num_to_str].astype(str) + '"'
    n_str_features_added = df.shape[1] - n_columns_before_str_features
    new_line()
    to_print = f"Added {n_str_features_added} String Features (Extracted from numerical variables)"
    print(colored(to_print, 'red'))
# =======
def cluping_rare_cases_in_one_catagory(x, rare_below=10, min_rare_group=8):
    """Lump the infrequent categories of column *x* of ``df`` into "Rare cases".

    Categories occurring fewer than *rare_below* times are replaced by the
    label "Rare cases". If that lumped group itself has fewer than
    *min_rare_group* rows, those rows are given the most common value instead.

    Parameters
    ----------
    x : str
        Name of a column of ``df``.
    rare_below : int, optional
        Categories with fewer occurrences than this are lumped together.
    min_rare_group : int, optional
        Minimum size for the "Rare cases" group to be kept as a category.

    Returns
    -------
    pandas.Series or None
        The transformed column. None when the transformed column has a
        single distinct value; in that case the column is dropped from ``df``.
    """
    global df
    x = df[x]
    counts = x.value_counts()
    rare_values = counts[counts < rare_below].index.to_list()
    x = x.replace(rare_values, "Rare cases")

    # Size of the lumped group, looked up by label; 0 when nothing was lumped
    # or when the column has no values at all.
    rare_group_size = x.value_counts().get("Rare cases", 0)
    if 0 < rare_group_size < min_rare_group:
        # The "Rare cases" group is too small to stand as its own category:
        # replace it with the most common value.
        x[x == "Rare cases"] = x.mode()[0]

    if x.nunique() == 1:
        new_line()
        to_print = f"The column <{x.name}> have only one unique value, We droped it from the data."
        print(colored(to_print, 'red'))
        df.drop(columns=x.name, inplace=True)
        return None
    return x
# Lump rare categories in every object column; columns the helper dropped
# (it returned None) are skipped.
for var in df.select_dtypes("O").columns:
    lumped_column = cluping_rare_cases_in_one_catagory(var)
    if isinstance(lumped_column, pd.Series):
        df[var] = lumped_column
new_line()
# Report, per column, how many rows ended up in the "Rare cases" category.
rare_case_counts = (df == 'Rare cases').sum().sort_values()
rare_case_counts = rare_case_counts.where(rare_case_counts > 0).dropna()
rare_case_table = pd.DataFrame({
    "Count": rare_case_counts,
    "Ratio": (rare_case_counts / len(df) * 100).round(4),
})
print(f"<Rare case> catagory:\n{rare_case_table.to_string()}")
# ----------------------------------------------------------------------- END (Feature engineering)
# Re-classify the columns now that feature engineering changed them.
dtypes = DTYPES()
# ---------------------------------------------------- Correlation plot
new_line()
# Absolute pairwise correlations of the numeric columns.
cor_df = df.select_dtypes('number').corr().abs()
# Hide the upper triangle (and diagonal): it only mirrors the lower one.
upper_triangle = np.triu(np.ones_like(cor_df, dtype=bool))
plt.subplots(figsize=(17, 10))
viridis = sns.color_palette("viridis", as_cmap=True)
heatmap_axes = sns.heatmap(
    cor_df,
    mask=upper_triangle,
    cmap=viridis,
    vmax=.3,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
heatmap_axes.set_title("abs (Correlation) plot", fontsize=25)
plt.show()
# ---------------------------------------------------------------------
#===
# m = 0
# Per-column report: for every classified column print duplicate warnings,
# drop it when it is useless, and otherwise print statistics and plots that
# depend on its type ('Number', 'Object' or 'Date').

# Distinct-value counts do not change inside the loop (only whole columns are
# dropped), so they are computed once instead of once per pair of columns.
nunique_by_column = df.nunique()

for column_name, type_ in zip(dtypes["Column"], dtypes["dtype"]):
    x = df[column_name]
    to_print = f"\n\n\n========================================= {column_name} =========================================\n\n"
    print(colored(to_print, 'red'))

    # Two columns are duplicates of each other when their values map one to
    # one: same number of distinct values, and each value of one column is
    # paired with exactly one value of the other.
    for col_ in df.columns:
        if col_ == column_name:
            continue
        if nunique_by_column[col_] == nunique_by_column[column_name]:
            unique_combination = df[[col_, column_name]].drop_duplicates()
            if unique_combination.apply(lambda s: s.is_unique).sum() == 2:
                new_line()
                to_print = f"This Columns is duplicate of <{col_}> column"
                print(colored(to_print, 'red'))

    print("Column Type : ", end="")
    print(colored(type_, 'red'))

    if x.isna().all():
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because it is all Empty", 'red'))
        continue

    # The labels produced by DTYPES() are 'Object', 'Number' and 'Date'.
    if type_ in ["Object", "Date"]:
        if x.is_unique:
            new_line()
            df.drop(columns=column_name, inplace=True)
            to_print = f"We dropped This column, because it's a {type_} columns, and it's all values are unique"
            print(colored(to_print, 'red'))
            continue

    if x.nunique() == 1:
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because There is only one unique value", 'red'))
        continue

    if type_ == "Number":
        # Correlations of this column with every other numeric column,
        # sorted ascending so the strongest ones come last.
        local_cor = cor_df[column_name].drop(column_name).reset_index()
        local_cor = local_cor.reindex(local_cor[column_name].abs().sort_values().index)
        if local_cor[column_name].max() == 1:
            new_line()
            to_print = f"This column is perfactly correlated with column <{local_cor[local_cor[column_name] == 1]['index'].values[0]}, so remove one of them"
            print(colored(to_print, 'red'))
        new_line()
        xm = local_cor[-3:].rename(columns={'index': 'Column name', column_name: 'Correlation'}).reset_index(drop=True)
        xm.index = xm['Column name']
        xm.drop(columns="Column name", inplace=True)
        # Nothing to plot when this is the only numeric column.
        if not xm.empty:
            xm.plot(kind='barh', grid=True, figsize=(10, 1.5))
            plt.title("Most 3 correlated features with this columns (sorted)", size=14)
            plt.xlabel("Correlation", size=12)
            plt.show()

        new_line()
        skewness = x.skew(skipna=True)
        if abs(skewness) < 0.5:
            print(f"The data is fairly symmetrical (skewness is: {skewness})")
        elif abs(skewness) < 1:
            print(f"The data are moderately skewed (skewness is: {skewness})")
        else:
            to_print = f"The data are highly skewed (skewness is: {skewness})\nNote: When skewness exceed |1| we called it highly skewed"
            print(colored(to_print, 'red'))

        # Summary table. 'Outlies' counts the values lying more than 3
        # standard deviations away from the mean.
        ff = [x.count(), x.isna().sum(), x.mean(), x.std(), x.min()]
        ff += x.quantile([.25, .5, .75]).to_list()
        ff += [x.max(), x.nunique(), (((x - x.mean()) / x.std()).abs() > 3).sum(), (x < 0).sum(), (x == 0).sum()]
        f = pd.DataFrame(ff, index=['Count', 'NA', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max', 'Nunique', 'Outlies', 'Nagetive', 'Zeros'], columns=['Count'])
        f['Ratio'] = f.Count / x.count() * 100
        # A percentage of the row count is meaningless for these statistics.
        f.loc['Mean':'Max', 'Ratio'] = None
        new_line()
        print(f.round(2).to_string())
        plot_numerical_columns(column_name)

    elif type_ == "Object":
        value_frequencies = x.value_counts()
        n_unique = x.nunique()
        l = (
            x.count(),
            n_unique,
            len(x),
            x.isna().sum(),
            (x == x.mode().values[0]).sum(),
            (x == value_frequencies.tail(1).index[0]).sum(),
            value_frequencies.where(lambda counts: counts == 1).dropna().size,
        )
        f = pd.DataFrame(l, index=['Count', 'Nunique', 'Len', 'NA', 'Most frequent', 'Least frequent', 'Values occured only once'], columns=['Counts'])
        f['Ratio'] = (f.Counts / x.count() * 100).round(4)
        f.loc[['Len'], 'Ratio'] = None
        new_line()
        print(f.to_string())

        # Values that differ only by letter case usually denote one category.
        n_unique_lower = x.str.lower().nunique()
        if n_unique_lower != n_unique:
            new_line()
            to_print = f"Case issue\n\tin orignal variable There are {n_unique} unique values\n\tin lower verstion there are {n_unique_lower} unique values.\n"
            print(colored(to_print, 'red'))
        # The same goes for values that differ only by surrounding whitespace.
        n_unique_stripped = x.str.strip().nunique()
        if n_unique_stripped != n_unique:
            new_line()
            to_print = f"Space issue\n\tin orignal variable There are {n_unique} unique values\n\tin striped verstion there are {n_unique_stripped} unique values."
            print(colored(to_print, 'red'))
        plot_catagorical_columns(column_name)

    elif type_ == "Date":
        new_line()
        rd = relativedelta.relativedelta(pd.to_datetime(x.max()), pd.to_datetime(x.min()))
        to_print = f"Diffrenece between first and last date:\n\tYears : {rd.years}\n\tMonths: {rd.months}\n\tDays : {rd.days}"
        print(colored(to_print, 'red'))

        value_frequencies = x.value_counts()
        ff = (
            x.count(),
            x.nunique(),
            (x == x.mode().values[0]).sum(),
            (x == value_frequencies.tail(1).index[0]).sum(),
            value_frequencies.where(lambda counts: counts == 1).dropna().size,
        )
        f = pd.DataFrame(ff, index=["Count", 'Nunique', 'Most frequent values', 'Least frequent values', 'Values occured only once count'], columns=['Counts'])
        f['Ratio'] = (f.Counts / x.count() * 100).round(4)
        new_line()
        print(f"\n{f.to_string()}")

        # Report the gaps between the smallest and largest year, month and
        # day of month seen; sorted, because set iteration order is arbitrary.
        missing_years = set(np.arange(x.dt.year.min(), x.dt.year.max() + 1)).difference(x.dt.year.unique())
        if missing_years:
            new_line()
            print(colored("These Years (in order) are missing:\n", 'red'))
            for i in sorted(missing_years):
                print("\t", i, end=", ")
        missing_months = set(np.arange(x.dt.month.min(), x.dt.month.max() + 1)).difference(x.dt.month.unique())
        if missing_months:
            new_line()
            print(colored("These Months (in order) are missing:\n", 'red'))
            for i in sorted(missing_months):
                print("\t", i, end=", ")
        missing_days = set(np.arange(x.dt.day.min(), x.dt.day.max() + 1)).difference(x.dt.day.unique())
        if missing_days:
            new_line()
            print(colored("These Days (in order) are missing:\n", 'red'))
            for i in sorted(missing_days):
                print("\t", i, end=", ")
        new_line()
        plot_date_columns(column_name)
# ================================================================================================================ Modeling
# Modeling. The kind of model depends on the target column:
#   numeric target             -> regression (OLS, then a random forest)
#   object target, 2 classes   -> logistic regression evaluated over thresholds
#   object target, > 2 classes -> random forest classifier
print("\n\n")
print("----------------------------------------------------------------------------------------------")
print("****************************************** Modeling ******************************************")

target_dtype = df[target_variable].dtype
# Comparing the dtype with the builtins float/int depends on the platform's
# default integer width, so the pandas dtype predicates are used instead.
target_is_numeric = (pd.api.types.is_numeric_dtype(target_dtype)
                     and not pd.api.types.is_bool_dtype(target_dtype))

# Regression problem
if target_is_numeric:
    print("\n-------------------- This is Regression problem --------------------\n")
    print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")

    # Design matrix: the numeric columns plus one-hot encoded non-numeric
    # columns. The dummies are created as floats so that the frame stays
    # purely numeric for statsmodels.
    numeric_part = df.select_dtypes("number")
    dummy_part = pd.get_dummies(df.select_dtypes(exclude="number"), prefix_sep="__", dtype=float)
    df = pd.concat([numeric_part, dummy_part], axis=1)
    del numeric_part
    del dummy_part
    # ====
    train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])
    # ====
    # --------------------------------------------------------- Linear regression
    to_print = "\n ------------------------------------- Linear Regression -------------------------------------\n"
    print(colored(to_print, 'red'))
    # NOTE(review): no constant column is added to train_X, so this OLS fit
    # has no intercept term - confirm that this is intended.
    model_reg = OLS(train_y, train_X).fit()

    # Turn the coefficient table of the text summary into a DataFrame.
    summary = model_reg.summary()
    summary_df = pd.DataFrame(summary.tables[1])
    summary_df.columns = summary_df.iloc[0]
    summary_df.drop(0, inplace=True)
    summary_df.columns = summary_df.columns.astype(str)
    summary_df.columns = ["Variable"] + summary_df.columns[1:].to_list()
    for i in summary_df.columns[1:]:
        summary_df[i] = summary_df[i].astype(str).astype(float)
    summary_df.Variable = summary_df.Variable.astype(str)
    # Significance stars derived from the p-value.
    summary_df['Indicator'] = summary_df['P>|t|'].apply(lambda x: "***" if x < 0.001 else "**" if x < 0.01 else "*" if x < 0.05 else "." if x < 0.1 else "")
    summary_df = summary_df.sort_values("Variable").reset_index(drop=True)
    # to_csv() without a path only returns the text; write the file that the
    # message below announces.
    summary_df.to_csv("summary_OLS_1.csv", index=False)
    new_line()
    print(colored("NOTE: This summary saved as <summary_OLS_1.csv>", 'red'))
    new_line()
    print(summary_df.to_string())
    # ============================= Model statistic
    predictions = model_reg.predict(test_X)
    new_line()
    print(colored(" --- Model statistic --- \n", 'red'))
    print(f"R-squared : {round(model_reg.rsquared, 3)}")
    print(f"Adj. R-squared : {round(model_reg.rsquared_adj, 3)}")
    print(f"F-statistic : {round(model_reg.fvalue)}")
    print(f"Prob (F-statistic): {model_reg.f_pvalue}")
    print(f"No. Observations : {round(model_reg.nobs)}")
    print(f"AIC : {round(model_reg.aic)}")
    print(f"Df Residuals : {round(model_reg.df_resid)}")
    print(f"BIC : {round(model_reg.bic)}")
    print(f"RMSE (test) : {RMSE(predictions)}")
    # ======
    # Residuals should be uncorrelated with the features: report the
    # strongest correlation found.
    residual_check = train_X.copy("deep")
    residual_check['Errors__'] = model_reg.resid
    residual_check = residual_check.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {residual_check.values[0]}, with columns <{residual_check.index[0]}>")
    print(f"Mean of train reseduals: {model_reg.resid.mean()}")
    del residual_check
    # ============================= END (Model statistic)
    # --------------------------------------------------------- END Linear regression
    # --------------------------------------------------------- Random Forest
    print("\n ------------------------------------- Random Forest -------------------------------------\n")
    rf = RandomForestRegressor(n_estimators=200, oob_score=True)
    model_rf = rf.fit(train_X, train_y)
    predictions_rf = rf.predict(test_X)
    new_line()
    print(colored("RF model peramters:\n", 'red'))
    pprint.pprint(model_rf.get_params())
    new_line()
    # Plot the 30 most important features at most.
    featuresImportance = pd.Series(model_rf.feature_importances_, index=train_X.columns).sort_values(ascending=False)
    if len(featuresImportance) > 30:
        featuresImportance = featuresImportance.head(30)
    featuresImportance.plot(figsize=(20, 10), kind='bar', grid=True)
    plt.title("RandomForest Feature importances Graph", size=18, color='red')
    plt.xlabel("Features", size=14, color='red')
    plt.ylabel("Importance", size=14, color='red')
    plt.show()
    del featuresImportance
    new_line()
    print(colored("--- Model statistic ---", 'red'))
    # The coefficient of determination R^2 of the prediction.
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    print(f"R^2 (test) : {rf.score(test_X, test_y)}")
    print(f"R^2 (train): {rf.score(train_X, train_y)}")
    print(f"RMSE (test): {RMSE(predictions_rf)}")
    print(f"oob score : {model_rf.oob_score_}")
    residual_check = test_X.copy("deep")
    errors_rf = predictions_rf - test_y
    residual_check['Errors__'] = errors_rf
    residual_check = residual_check.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {residual_check.values[0]}, with columns <{residual_check.index[0]}>")
    # --------------------------------------------------------- END Random Forest
elif target_dtype == "O":
    # Classification problem
    if df[target_variable].nunique() == 2:
        print("\n-------------------- This is Binary Classification problem --------------------\n")
        print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
        # Non-object columns, one-hot encoded object features, and the target.
        df = pd.concat([
            df.select_dtypes(exclude="O"),
            pd.get_dummies(df.drop(columns=target_variable).select_dtypes("O")),
            df[[target_variable]],
        ], axis=1)
        train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])
        clf = LogisticRegression().fit(train_X, train_y)

        # classes_[1] is treated as the positive class, matching the TP / FP
        # definitions below.
        negative_class, positive_class = clf.classes_
        positive_proba = pd.Series(clf.predict_proba(test_X)[:, 1])
        # Positional index, so that it lines up with the predictions.
        test_y = test_y.reset_index(drop=True)

        lst = []
        for thresh in np.linspace(positive_proba.min(), positive_proba.max(), 50)[1:]:
            # Predict the positive class when its probability reaches the threshold.
            pred = pd.Series(np.where(positive_proba >= thresh, positive_class, negative_class))
            TN = ((pred == negative_class) & (test_y == negative_class)).sum()
            TP = ((pred == positive_class) & (test_y == positive_class)).sum()
            FN = ((pred == negative_class) & (test_y == positive_class)).sum()
            FP = ((pred == positive_class) & (test_y == negative_class)).sum()
            p = TP / (TP + FP)
            r = TP / (TP + FN)
            f1 = 2 * ((p * r) / (p + r))
            lst.append((thresh, (pred == test_y).mean(), p, r, f1))
        d = pd.DataFrame(lst, columns=["Thresold", "Accurecy(0-1)", "Precision", "Recall", "F1"])
        d = d.set_index("Thresold")
        d.plot(grid=True, figsize=(18, 7))
        plt.title("Model performance at diffrent Thresolds", size=18, color='red')
        plt.xlabel("Thresold", size=14, color='red')
        plt.ylabel("")
        plt.show()
    else:
        to_print = "\n-------------------- This is Multiclass Classification problem --------------------\n"
        print(colored(to_print, 'red'))
        print("'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")

        # Replace every object column, the target included, by integer label
        # codes. Assigning column by column keeps the rows aligned whatever
        # the index of df is. The target's encoder is kept so that the
        # original class names can be shown on the plots.
        target_encoder = LabelEncoder()
        for object_column in df.select_dtypes("O").columns:
            encoder = target_encoder if object_column == target_variable else LabelEncoder()
            df[object_column] = encoder.fit_transform(df[object_column].astype(str))

        train_X, test_X, train_y, test_y = train_test_split(df.drop(columns=target_variable), df[target_variable])
        clf = RandomForestClassifier(n_estimators=1000).fit(train_X, train_y)
        predictions = clf.predict(test_X)

        # Plot the 30 most important features at most.
        feature_imp = pd.Series(clf.feature_importances_, index=train_X.columns).sort_values(ascending=False)
        if feature_imp.size > 30:
            feature_imp = feature_imp.head(30)
        feature_imp.plot(kind='barh', figsize=(17, 10), grid=True)
        plt.title("Feature importances Graph", size=18, color='red')
        plt.xlabel("Importance", size=14, color='red')
        plt.ylabel("Feature", size=14, color='red')
        plt.show()
        # ====
        print(f"accuracy_score: {metrics.accuracy_score(test_y, predictions)}")
        # With more than two classes f1_score needs an explicit averaging
        # mode; 'macro' gives every class the same weight.
        print(f"f1_score: {metrics.f1_score(test_y, predictions, average='macro')}")

        # ROC and precision-recall curves are binary notions: draw one
        # one-vs-rest curve per class from the predicted class probabilities.
        class_names = target_encoder.classes_[clf.classes_]
        class_proba = clf.predict_proba(test_X)

        for position, class_code in enumerate(clf.classes_):
            fpr, tpr, _ = metrics.roc_curve(test_y == class_code, class_proba[:, position])
            plt.plot(fpr, tpr, label=str(class_names[position]))
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        plt.legend()
        plt.title("ROC curve plot")
        plt.show()

        confusion = metrics.confusion_matrix(test_y, predictions, labels=clf.classes_)
        metrics.ConfusionMatrixDisplay(confusion, display_labels=class_names).plot()
        plt.title("Confusion matrix")
        plt.show()

        for position, class_code in enumerate(clf.classes_):
            precision, recall, _ = metrics.precision_recall_curve(test_y == class_code, class_proba[:, position])
            plt.plot(recall, precision, label=str(class_names[position]))
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.legend()
        plt.title("Precision recall curve")
        plt.show()
# ================================================================================================================ END Modeling
------------------------- The Data have: 2919 rows 81 columns ------------------------- Columns types distribution: object 43 int64 26 float64 12 dtype: int64
------------------------- There are 1459 NAs in target values, we droped those rows ------------------------- There are 19 (out of 81, [23%]) columns that contains 1 or more NA. ------------------------- 19 NA_indicator variables added to the data ========= NA Graphs =========
-------------------------
------------------------- NA columns data type Distribution: object 16 float64 3 dtype: int64 ------------------------- NaN Ratio (0-100) Electrical 0.068493 MasVnrType 0.547945 MasVnrArea 0.547945 BsmtQual 2.534247 BsmtCond 2.534247 BsmtFinType1 2.534247 BsmtExposure 2.602740 BsmtFinType2 2.602740 GarageCond 5.547945 GarageQual 5.547945 GarageFinish 5.547945 GarageType 5.547945 GarageYrBlt 5.547945 LotFrontage 17.739726 FireplaceQu 47.260274 Fence 80.753425 Alley 93.767123 MiscFeature 96.301370 PoolQC 99.520548 dtype: float64 ------------------------- (Before Missing values treatment) There are 6965 Missing values: 6617 in catagorical variables 348 in numerical columns 0.0 in others (After filling numeric missing values) There are 6617 Missing values: 6617 in catagorical variables 0 in numerical columns 0.0 in others ------------------------- Missing values imputed, Now there are 0 Missing values ------------------------- There are 1 column that have all unique values, so no value repeatation, we droped it column. Dropped column name are: Id ------------------------- Now The Data have: 1460 rows 99 columns ------------------------- Added 18 String Features (Extracted from numerical variables) ------------------------- The column <Street> have only one unique value, We droped it from the data. ------------------------- The column <Utilities> have only one unique value, We droped it from the data. ------------------------- The column <Electrical_NA_indicator> have only one unique value, We droped it from the data. ------------------------- The column <PoolQC_NA_indicator> have only one unique value, We droped it from the data. ------------------------- The column <PoolArea_str> have only one unique value, We droped it from the data. 
------------------------- <Rare case> catagory: Count Ratio HouseStyle 8.0 0.5479 MasVnrType_NA_indicator 8.0 0.5479 MasVnrArea_NA_indicator 8.0 0.5479 FullBath_str 9.0 0.6164 Foundation 9.0 0.6164 RoofStyle 9.0 0.6164 MiscFeature 10.0 0.6849 Neighborhood 11.0 0.7534 Heating 14.0 0.9589 BedroomAbvGr_str 14.0 0.9589 Condition1 15.0 1.0274 Condition2 15.0 1.0274 RoofMatl 15.0 1.0274 Exterior2nd 17.0 1.1644 3SsnPorch_str 24.0 1.6438 LowQualFinSF_str 26.0 1.7808 SaleType 28.0 1.9178 MiscVal_str 41.0 2.8082 -------------------------
========================================= MSZoning ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1151 78.8356 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= Alley ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 835 57.1918 Least frequent 625 42.8082 Values occured only once 0 0.0000
========================================= LotShape ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 925 63.3562 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= LandContour ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1311 89.7945 Least frequent 36 2.4658 Values occured only once 0 0.0000
========================================= LotConfig ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1056 72.3288 Least frequent 47 3.2192 Values occured only once 0 0.0000
========================================= LandSlope ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1382 94.6575 Least frequent 13 0.8904 Values occured only once 0 0.0000
========================================= Neighborhood ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 24 1.6438 Len 1460 NaN NA 0 0.0000 Most frequent 225 15.4110 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Condition1 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 1260 86.3014 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Condition2 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1445 98.9726 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= BldgType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1220 83.5616 Least frequent 31 2.1233 Values occured only once 0 0.0000
========================================= HouseStyle ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 726 49.7260 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= RoofStyle ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1141 78.1507 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= RoofMatl ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Exterior1st ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 522 35.7534 Least frequent 20 1.3699 Values occured only once 0 0.0000
========================================= Exterior2nd ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 504 34.5205 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= MasVnrType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 864 59.1781 Least frequent 16 1.0959 Values occured only once 0 0.0000
========================================= ExterQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 906 62.0548 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= ExterCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1286 88.0822 Least frequent 28 1.9178 Values occured only once 0 0.0000
========================================= Foundation ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 647 44.3151 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= BsmtQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 663 45.4110 Least frequent 52 3.5616 Values occured only once 0 0.0000
========================================= BsmtCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1340 91.7808 Least frequent 54 3.6986 Values occured only once 0 0.0000
========================================= BsmtExposure ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 956 65.4795 Least frequent 114 7.8082 Values occured only once 0 0.0000
========================================= BsmtFinType1 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 467 31.9863 Least frequent 74 5.0685 Values occured only once 0 0.0000
========================================= BsmtFinType2 ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 1293 88.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= Heating ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1428 97.8082 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= HeatingQC ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 742 50.8219 Least frequent 49 3.3562 Values occured only once 0 0.0000
========================================= CentralAir ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1365 93.4932 Least frequent 95 6.5068 Values occured only once 0 0.0000
========================================= Electrical ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1339 91.7123 Least frequent 27 1.8493 Values occured only once 0 0.0000
========================================= KitchenQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 735 50.3425 Least frequent 39 2.6712 Values occured only once 0 0.0000
========================================= Functional ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1366 93.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= FireplaceQu ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 680 46.5753 Least frequent 33 2.2603 Values occured only once 0 0.0000
========================================= GarageType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 910 62.3288 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= GarageFinish ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 672 46.0274 Least frequent 354 24.2466 Values occured only once 0 0.0000
========================================= GarageQual ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1331 91.1644 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= GarageCond ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1361 93.2192 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= PavedDrive ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1340 91.7808 Least frequent 30 2.0548 Values occured only once 0 0.0000
========================================= PoolQC ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 865 59.2466 Least frequent 26 1.7808 Values occured only once 0 0.0000
========================================= Fence ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 736 50.4110 Least frequent 39 2.6712 Values occured only once 0 0.0000
========================================= MiscFeature ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1371 93.9041 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= SaleType ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1267 86.7808 Least frequent 28 1.9178 Values occured only once 0 0.0000
========================================= SaleCondition ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1202 82.3288 Least frequent 12 0.8219 Values occured only once 0 0.0000
========================================= LotFrontage_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1201 82.2603 Least frequent 259 17.7397 Values occured only once 0 0.0000
========================================= Alley_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1369 93.7671 Least frequent 91 6.2329 Values occured only once 0 0.0000
========================================= MasVnrType_NA_indicator ========================================= ------------------------- This Columns is duplicate of <MasVnrArea_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= MasVnrArea_NA_indicator ========================================= ------------------------- This Columns is duplicate of <MasVnrType_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
========================================= BsmtQual_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtCond_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtExposure_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
========================================= BsmtFinType1_NA_indicator ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
========================================= BsmtFinType2_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
========================================= FireplaceQu_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 770 52.7397 Least frequent 690 47.2603 Values occured only once 0 0.0000
========================================= GarageType_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageYrBlt_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageFinish_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageQual_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= GarageCond_NA_indicator ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= Fence_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1179 80.7534 Least frequent 281 19.2466 Values occured only once 0 0.0000
========================================= MiscFeature_NA_indicator ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1406 96.3014 Least frequent 54 3.6986 Values occured only once 0 0.0000
========================================= MSSubClass_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 14 0.9589 Len 1460 NaN NA 0 0.0000 Most frequent 540 36.9863 Least frequent 10 0.6849 Values occured only once 0 0.0000
========================================= OverallQual_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 402 27.5342 Least frequent 18 1.2329 Values occured only once 0 0.0000
========================================= OverallCond_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 827 56.6438 Least frequent 22 1.5068 Values occured only once 0 0.0000
========================================= LowQualFinSF_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 26 1.7808 Values occured only once 0 0.0000
========================================= BsmtFullBath_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 857 58.6986 Least frequent 15 1.0274 Values occured only once 0 0.0000
========================================= BsmtHalfBath_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1380 94.5205 Least frequent 80 5.4795 Values occured only once 0 0.0000
========================================= FullBath_str ========================================= ------------------------- This Columns is duplicate of <FullBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 768 52.6027 Least frequent 9 0.6164 Values occured only once 0 0.0000
========================================= HalfBath_str ========================================= ------------------------- This Columns is duplicate of <HalfBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 913 62.5342 Least frequent 12 0.8219 Values occured only once 0 0.0000
========================================= BedroomAbvGr_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 804 55.0685 Least frequent 14 0.9589 Values occured only once 0 0.0000
========================================= KitchenAbvGr_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1395 95.5479 Least frequent 65 4.4521 Values occured only once 0 0.0000
========================================= TotRmsAbvGrd_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 404 27.6712 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= Fireplaces_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 695 47.6027 Least frequent 115 7.8767 Values occured only once 0 0.0000
========================================= GarageCars_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 829 56.7808 Least frequent 81 5.5479 Values occured only once 0 0.0000
========================================= 3SsnPorch_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1436 98.3562 Least frequent 24 1.6438 Values occured only once 0 0.0000
========================================= MiscVal_str ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1408 96.4384 Least frequent 11 0.7534 Values occured only once 0 0.0000
========================================= MoSold_str ========================================= ------------------------- This Columns is duplicate of <MoSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 253 17.3288 Least frequent 52 3.5616 Values occured only once 0 0.0000
========================================= YrSold_str ========================================= ------------------------- This Columns is duplicate of <YrSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 338 23.1507 Least frequent 175 11.9863 Values occured only once 0 0.0000
========================================= MSSubClass ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.4076567471495591)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 56.9 NaN
Std 42.3 NaN
Min 20.0 NaN
25% 20.0 NaN
50% 50.0 NaN
75% 70.0 NaN
Max 190.0 NaN
Nunique 15.0 1.03
Outlies 30.0 2.05
Nagetive 0.0 0.00
Zeros 0.0 0.00
========================================= LotFrontage ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.0120008521763144)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 70.75 NaN
Std 23.47 NaN
Min 21.00 NaN
25% 60.00 NaN
50% 70.00 NaN
75% 80.00 NaN
Max 313.00 NaN
Nunique 224.00 15.34
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= LotArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 12.207687851233496)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 10516.83 NaN
Std 9981.26 NaN
Min 1300.00 NaN
25% 7553.50 NaN
50% 9478.50 NaN
75% 11601.50 NaN
Max 215245.00 NaN
Nunique 1073.00 73.49
Outlies 13.00 0.89
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= OverallQual ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.2169439277628693)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.10 NaN
Std 1.38 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 10.00 NaN
Nunique 10.00 0.68
Outlies 2.00 0.14
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= OverallCond ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6930674724842182)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.58 NaN
Std 1.11 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 5.00 NaN
75% 6.00 NaN
Max 9.00 NaN
Nunique 9.00 0.62
Outlies 28.00 1.92
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YearBuilt ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.613461172488183)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1971.27 NaN
Std 30.20 NaN
Min 1872.00 NaN
25% 1954.00 NaN
50% 1973.00 NaN
75% 2000.00 NaN
Max 2010.00 NaN
Nunique 112.00 7.67
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YearRemodAdd ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.5035620027004709)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1984.87 NaN
Std 20.65 NaN
Min 1950.00 NaN
25% 1967.00 NaN
50% 1994.00 NaN
75% 2004.00 NaN
Max 2010.00 NaN
Nunique 61.00 4.18
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= MasVnrArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.6682455485578593)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 103.84 NaN
Std 180.74 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 166.00 NaN
Max 1600.00 NaN
Nunique 335.00 22.95
Outlies 32.00 2.19
Nagetive 0.00 0.00
Zeros 861.00 58.97
========================================= BsmtFinSF1 ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.685503071910789)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 443.64 NaN
Std 456.10 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 383.50 NaN
75% 712.25 NaN
Max 5644.00 NaN
Nunique 637.00 43.63
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 467.00 31.99
========================================= BsmtFinSF2 ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.255261108933303)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.55 NaN
Std 161.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 1474.00 NaN
Nunique 144.00 9.86
Outlies 50.00 3.42
Nagetive 0.00 0.00
Zeros 1293.00 88.56
========================================= BsmtUnfSF ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.9202684528039037)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 567.24 NaN
Std 441.87 NaN
Min 0.00 NaN
25% 223.00 NaN
50% 477.50 NaN
75% 808.00 NaN
Max 2336.00 NaN
Nunique 780.00 53.42
Outlies 11.00 0.75
Nagetive 0.00 0.00
Zeros 118.00 8.08
========================================= TotalBsmtSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5242545490627664)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1057.43 NaN
Std 438.71 NaN
Min 0.00 NaN
25% 795.75 NaN
50% 991.50 NaN
75% 1298.25 NaN
Max 6110.00 NaN
Nunique 721.00 49.38
Outlies 10.00 0.68
Nagetive 0.00 0.00
Zeros 37.00 2.53
========================================= 1stFlrSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3767566220336365)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1162.63 NaN
Std 386.59 NaN
Min 334.00 NaN
25% 882.00 NaN
50% 1087.00 NaN
75% 1391.25 NaN
Max 4692.00 NaN
Nunique 753.00 51.58
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= 2ndFlrSF ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.8130298163023265)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 346.99 NaN
Std 436.53 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 728.00 NaN
Max 2065.00 NaN
Nunique 417.00 28.56
Outlies 4.00 0.27
Nagetive 0.00 0.00
Zeros 829.00 56.78
========================================= LowQualFinSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 9.011341288465387)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.84 NaN
Std 48.62 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 572.00 NaN
Nunique 24.00 1.64
Outlies 20.00 1.37
Nagetive 0.00 0.00
Zeros 1434.00 98.22
========================================= GrLivArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3665603560164552)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1515.46 NaN
Std 525.48 NaN
Min 334.00 NaN
25% 1129.50 NaN
50% 1464.00 NaN
75% 1776.75 NaN
Max 5642.00 NaN
Nunique 861.00 58.97
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= BsmtFullBath ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.596066609663168)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.43 NaN
Std 0.52 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 856.00 58.63
========================================= BsmtHalfBath ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.103402697955168)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.06 NaN
Std 0.24 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 82.00 5.62
Nagetive 0.00 0.00
Zeros 1378.00 94.38
========================================= FullBath ========================================= ------------------------- This Columns is duplicate of <FullBath_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.036561558402727165)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.57 NaN
Std 0.55 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 9.00 0.62
========================================= HalfBath ========================================= ------------------------- This Columns is duplicate of <HalfBath_str> column Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.675897448233722)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.38 NaN
Std 0.50 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 913.00 62.53
========================================= BedroomAbvGr ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21179009627507137)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.87 NaN
Std 0.82 NaN
Min 0.00 NaN
25% 2.00 NaN
50% 3.00 NaN
75% 3.00 NaN
Max 8.00 NaN
Nunique 8.00 0.55
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 6.00 0.41
========================================= KitchenAbvGr ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.488396777072859)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.05 NaN
Std 0.22 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 68.00 4.66
Nagetive 0.00 0.00
Zeros 1.00 0.07
========================================= TotRmsAbvGrd ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6763408364355531)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.52 NaN
Std 1.63 NaN
Min 2.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 14.00 NaN
Nunique 12.00 0.82
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= Fireplaces ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6495651830548841)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.61 NaN
Std 0.64 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 5.00 0.34
Nagetive 0.00 0.00
Zeros 690.00 47.26
========================================= GarageYrBlt ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.541264504372725)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1977.23 NaN
Std 24.78 NaN
Min 1900.00 NaN
25% 1960.00 NaN
50% 1978.00 NaN
75% 2001.00 NaN
Max 2010.00 NaN
Nunique 148.00 10.14
Outlies 1.00 0.07
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= GarageCars ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: -0.3425489297486655)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.77 NaN
Std 0.75 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 4.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 81.00 5.55
========================================= GarageArea ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.17998090674623907)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 472.98 NaN
Std 213.80 NaN
Min 0.00 NaN
25% 334.50 NaN
50% 480.00 NaN
75% 576.00 NaN
Max 1418.00 NaN
Nunique 441.00 30.21
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 81.00 5.55
========================================= WoodDeckSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5413757571931312)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 94.24 NaN
Std 125.34 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 168.00 NaN
Max 857.00 NaN
Nunique 274.00 18.77
Outlies 22.00 1.51
Nagetive 0.00 0.00
Zeros 761.00 52.12
========================================= OpenPorchSF ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.3643417403694404)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.66 NaN
Std 66.26 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 25.00 NaN
75% 68.00 NaN
Max 547.00 NaN
Nunique 202.00 13.84
Outlies 27.00 1.85
Nagetive 0.00 0.00
Zeros 656.00 44.93
========================================= EnclosedPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 3.08987190371177)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 21.95 NaN
Std 61.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 552.00 NaN
Nunique 120.00 8.22
Outlies 51.00 3.49
Nagetive 0.00 0.00
Zeros 1252.00 85.75
========================================= 3SsnPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 10.304342032693112)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 3.41 NaN
Std 29.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 508.00 NaN
Nunique 20.00 1.37
Outlies 23.00 1.58
Nagetive 0.00 0.00
Zeros 1436.00 98.36
========================================= ScreenPorch ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.122213743143115)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 15.06 NaN
Std 55.76 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 480.00 NaN
Nunique 76.00 5.21
Outlies 55.00 3.77
Nagetive 0.00 0.00
Zeros 1344.00 92.05
========================================= PoolArea ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 14.828373640750588)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.76 NaN
Std 40.18 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 738.00 NaN
Nunique 8.00 0.55
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 1453.00 99.52
========================================= MiscVal ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 24.476794188821916)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 43.49 NaN
Std 496.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 15500.00 NaN
Nunique 21.00 1.44
Outlies 8.00 0.55
Nagetive 0.00 0.00
Zeros 1408.00 96.44
========================================= MoSold ========================================= ------------------------- This Columns is duplicate of <MoSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21205298505146022)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.32 NaN
Std 2.70 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 8.00 NaN
Max 12.00 NaN
Nunique 12.00 0.82
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= YrSold ========================================= ------------------------- This Columns is duplicate of <YrSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.09626851386568028)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2007.82 NaN
Std 1.33 NaN
Min 2006.00 NaN
25% 2007.00 NaN
50% 2008.00 NaN
75% 2009.00 NaN
Max 2010.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
========================================= SalePrice ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.8828757597682129)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 180921.2 NaN
Std 79442.5 NaN
Min 34900.0 NaN
25% 129975.0 NaN
50% 163000.0 NaN
75% 214000.0 NaN
Max 755000.0 NaN
Nunique 663.0 45.41
Outlies 22.0 1.51
Nagetive 0.0 0.00
Zeros 0.0 0.00
---------------------------------------------------------------------------------------------- ****************************************** Modeling ****************************************** -------------------- This is Regression problem -------------------- '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' ------------------------------------- Linear Regression ------------------------------------- ------------------------- NOTE: This summary saved as <summary_OLS_1.csv> ------------------------- Variable coef std err t P>|t| [0.025 0.975] Indicator 0 1stFlrSF 3.3533 12.672 0.265 0.791 -21.521 28.227 1 2ndFlrSF 12.8116 11.831 1.083 0.279 -10.411 36.034 2 3SsnPorch 62.6969 57.210 1.096 0.273 -49.597 174.990 3 3SsnPorch_str__"0.0" 1234.7586 6832.778 0.181 0.857 -12200.000 14600.000 4 3SsnPorch_str__Rare cases -1239.5703 6832.838 -0.181 0.856 -14700.000 12200.000 5 Alley_NA_indicator__Missing -1085.4065 2249.147 -0.483 0.630 -5500.115 3329.302 6 Alley_NA_indicator__Not missing 1080.5948 2249.139 0.480 0.631 -3334.098 5495.288 7 Alley__Grvl -3292.4841 1581.348 -2.082 0.038 -6396.411 -188.557 * 8 Alley__Pave 3287.6724 1581.357 2.079 0.038 183.728 6391.617 * 9 BedroomAbvGr -2917.6677 5274.186 -0.553 0.580 -13300.000 7434.697 10 BedroomAbvGr_str__"1.0" 6236.4742 12700.000 0.490 0.624 -18700.000 31200.000 11 BedroomAbvGr_str__"2.0" 2998.9900 7567.986 0.396 0.692 -11900.000 17900.000 12 BedroomAbvGr_str__"3.0" -45.6206 3666.690 -0.012 0.990 -7242.734 7151.492 13 BedroomAbvGr_str__"4.0" 7076.2427 4786.581 1.478 0.140 -2319.033 16500.000 14 BedroomAbvGr_str__"5.0" -25690.0000 11600.000 -2.222 0.027 -48400.000 -3000.884 * 15 BedroomAbvGr_str__Rare cases 9414.4073 12500.000 0.756 0.450 -15000.000 33900.000 16 BldgType__1Fam -7985.6601 12900.000 -0.617 0.537 -33400.000 17400.000 17 BldgType__2fmCon 3088.0825 26800.000 0.115 0.908 -49600.000 55800.000 18 BldgType__Duplex -2900.9887 6260.765 -0.463 0.643 -15200.000 9387.871 19 BldgType__Twnhs -115.7867 
14600.000 -0.008 0.994 -28800.000 28500.000 20 BldgType__TwnhsE 7909.5411 13600.000 0.580 0.562 -18900.000 34700.000 21 BsmtCond_NA_indicator__Missing -1686.3679 1149.566 -1.467 0.143 -3942.779 570.043 22 BsmtCond_NA_indicator__Not missing 1681.5562 1149.320 1.463 0.144 -574.371 3937.484 23 BsmtCond__Fa -1555.1252 3873.944 -0.401 0.688 -9159.044 6048.793 24 BsmtCond__Gd -956.1862 3339.981 -0.286 0.775 -7512.023 5599.650 25 BsmtCond__TA 2506.4996 2387.370 1.050 0.294 -2179.518 7192.517 26 BsmtExposure_NA_indicator__Missing -1686.3679 1149.566 -1.467 0.143 -3942.779 570.043 27 BsmtExposure_NA_indicator__Not missing 1681.5562 1149.320 1.463 0.144 -574.371 3937.484 28 BsmtExposure__Av -3420.6587 2177.236 -1.571 0.117 -7694.217 852.900 29 BsmtExposure__Gd 17150.0000 3032.425 5.655 0.000 11200.000 23100.000 *** 30 BsmtExposure__Mn -6291.4475 2836.058 -2.218 0.027 -11900.000 -724.730 * 31 BsmtExposure__No -7440.6207 1880.599 -3.957 0.000 -11100.000 -3749.313 *** 32 BsmtFinSF1 -3.8000 4.822 -0.788 0.431 -13.265 5.665 33 BsmtFinSF2 15.8494 12.050 1.315 0.189 -7.803 39.502 34 BsmtFinType1_NA_indicator__Missing -1686.3679 1149.566 -1.467 0.143 -3942.779 570.043 35 BsmtFinType1_NA_indicator__Not missing 1681.5562 1149.320 1.463 0.144 -574.371 3937.484 36 BsmtFinType1__ALQ 1715.1004 2385.417 0.719 0.472 -2967.083 6397.284 37 BsmtFinType1__BLQ 1345.2990 2797.656 0.481 0.631 -4146.042 6836.640 38 BsmtFinType1__GLQ 5541.1455 2624.123 2.112 0.035 390.420 10700.000 * 39 BsmtFinType1__LwQ -4415.2560 3937.358 -1.121 0.262 -12100.000 3313.135 40 BsmtFinType1__Rec 750.7314 2905.428 0.258 0.796 -4952.150 6453.612 41 BsmtFinType1__Unf -4941.8320 2656.778 -1.860 0.063 -10200.000 272.989 . 
42 BsmtFinType2_NA_indicator__Missing -1686.3679 1149.566 -1.467 0.143 -3942.779 570.043 43 BsmtFinType2_NA_indicator__Not missing 1681.5562 1149.320 1.463 0.144 -574.371 3937.484 44 BsmtFinType2__ALQ -3704.7676 7524.530 -0.492 0.623 -18500.000 11100.000 45 BsmtFinType2__BLQ 2027.5973 5835.434 0.347 0.728 -9426.406 13500.000 46 BsmtFinType2__GLQ 3743.2330 8508.306 0.440 0.660 -13000.000 20400.000 47 BsmtFinType2__LwQ -2940.0163 5289.925 -0.556 0.579 -13300.000 7443.243 48 BsmtFinType2__Rec -4366.9981 4419.232 -0.988 0.323 -13000.000 4307.233 49 BsmtFinType2__Unf 5236.1399 4893.554 1.070 0.285 -4369.107 14800.000 50 BsmtFullBath -2110.4208 11500.000 -0.184 0.854 -24700.000 20400.000 51 BsmtFullBath_str__"0.0" 5264.9100 11800.000 0.447 0.655 -17800.000 28400.000 52 BsmtFullBath_str__"1.0" 12670.0000 4060.835 3.120 0.002 4699.509 20600.000 ** 53 BsmtFullBath_str__"2.0" -17940.0000 13300.000 -1.352 0.177 -44000.000 8104.623 54 BsmtHalfBath -5754.7820 12900.000 -0.446 0.656 -31100.000 19600.000 55 BsmtHalfBath_str__"0.0" -6809.4809 6713.388 -1.014 0.311 -20000.000 6367.804 56 BsmtHalfBath_str__"1.0" 6804.6691 6713.165 1.014 0.311 -6372.177 20000.000 57 BsmtQual_NA_indicator__Missing -1686.3679 1149.566 -1.467 0.143 -3942.779 570.043 58 BsmtQual_NA_indicator__Not missing 1681.5562 1149.320 1.463 0.144 -574.371 3937.484 59 BsmtQual__Ex 4381.7531 4145.575 1.057 0.291 -3755.334 12500.000 60 BsmtQual__Fa 2477.9285 4913.453 0.504 0.614 -7166.376 12100.000 61 BsmtQual__Gd -4594.7921 2609.322 -1.761 0.079 -9716.465 526.881 . 
62 BsmtQual__TA -2269.7012 2554.383 -0.889 0.375 -7283.539 2744.136 63 BsmtUnfSF -7.4200 5.172 -1.435 0.152 -17.572 2.732 64 CentralAir__N -1657.8733 2821.606 -0.588 0.557 -7196.226 3880.479 65 CentralAir__Y 1653.0615 2821.659 0.586 0.558 -3885.394 7191.517 66 Condition1__Artery -5304.5805 5234.016 -1.013 0.311 -15600.000 4968.939 67 Condition1__Feedr -5092.7408 4322.090 -1.178 0.239 -13600.000 3390.816 68 Condition1__Norm 6516.3363 2759.200 2.362 0.018 1100.478 11900.000 * 69 Condition1__PosN 9265.9151 7054.506 1.313 0.189 -4580.929 23100.000 70 Condition1__RRAe -20840.0000 8788.252 -2.372 0.018 -38100.000 -3593.152 * 71 Condition1__RRAn 4027.1131 6063.362 0.664 0.507 -7874.277 15900.000 72 Condition1__Rare cases 11430.0000 8216.296 1.391 0.165 -4701.047 27600.000 73 Condition2__Norm 3280.6809 5208.090 0.630 0.529 -6941.950 13500.000 74 Condition2__Rare cases -3285.4926 5208.143 -0.631 0.528 -13500.000 6937.242 75 Electrical__FuseA -1218.9711 3496.674 -0.349 0.727 -8082.372 5644.429 76 Electrical__FuseF 4510.1330 5273.796 0.855 0.393 -5841.468 14900.000 77 Electrical__SBrkr -3295.9736 3167.700 -1.040 0.298 -9513.651 2921.704 78 EnclosedPorch 26.0007 17.333 1.500 0.134 -8.022 60.023 79 ExterCond__Fa 5446.2409 5444.979 1.000 0.317 -5241.363 16100.000 80 ExterCond__Gd -4886.0212 3426.984 -1.426 0.154 -11600.000 1840.589 81 ExterCond__TA -565.0314 2912.153 -0.194 0.846 -6281.112 5151.050 82 ExterQual__Ex 7912.3072 6369.204 1.242 0.214 -4589.399 20400.000 83 ExterQual__Fa -7393.1494 10600.000 -0.698 0.486 -28200.000 13400.000 84 ExterQual__Gd 3000.4397 4246.016 0.707 0.480 -5333.795 11300.000 85 ExterQual__TA -3524.4092 4204.279 -0.838 0.402 -11800.000 4727.903 86 Exterior1st__AsbShng 20590.0000 21300.000 0.965 0.335 -21300.000 62500.000 87 Exterior1st__BrkFace 8714.9218 7230.872 1.205 0.228 -5478.099 22900.000 88 Exterior1st__CemntBd 4514.6631 26600.000 0.170 0.865 -47800.000 56800.000 89 Exterior1st__HdBoard -2945.6319 6721.156 -0.438 0.661 -16100.000 10200.000 90 
Exterior1st__MetalSd 943.3501 10500.000 0.090 0.928 -19600.000 21500.000 91 Exterior1st__Plywood 2388.9515 6814.090 0.351 0.726 -11000.000 15800.000 92 Exterior1st__Stucco -25930.0000 12400.000 -2.084 0.037 -50300.000 -1507.352 * 93 Exterior1st__VinylSd -10190.0000 7881.645 -1.293 0.196 -25700.000 5280.509 94 Exterior1st__Wd Sdng 2352.3249 6164.913 0.382 0.703 -9748.393 14500.000 95 Exterior1st__WdShing -446.8280 8364.569 -0.053 0.957 -16900.000 16000.000 96 Exterior2nd__AsbShng -27030.0000 20300.000 -1.328 0.184 -67000.000 12900.000 97 Exterior2nd__BrkFace 545.6956 9286.086 0.059 0.953 -17700.000 18800.000 98 Exterior2nd__CmentBd -3320.5107 27700.000 -0.120 0.905 -57700.000 51000.000 99 Exterior2nd__HdBoard 2969.5810 6471.508 0.459 0.646 -9732.933 15700.000 100 Exterior2nd__ImStucc 10310.0000 10400.000 0.987 0.324 -10200.000 30800.000 101 Exterior2nd__MetalSd -2047.8014 10600.000 -0.193 0.847 -22900.000 18800.000 102 Exterior2nd__Plywood -3891.5240 5989.464 -0.650 0.516 -15600.000 7864.816 103 Exterior2nd__Rare cases 620.0915 9203.667 0.067 0.946 -17400.000 18700.000 104 Exterior2nd__Stucco 8899.7887 12000.000 0.741 0.459 -14700.000 32500.000 105 Exterior2nd__VinylSd 10370.0000 7506.844 1.381 0.168 -4367.347 25100.000 106 Exterior2nd__Wd Sdng -586.5532 5819.852 -0.101 0.920 -12000.000 10800.000 107 Exterior2nd__Wd Shng 3160.2507 7486.242 0.422 0.673 -11500.000 17900.000 108 Fence_NA_indicator__Missing -1072.0532 1236.245 -0.867 0.386 -3498.600 1354.494 109 Fence_NA_indicator__Not missing 1067.2415 1236.279 0.863 0.388 -1359.371 3493.854 110 Fence__GdPrv -247.4536 2341.235 -0.106 0.916 -4842.915 4348.008 111 Fence__GdWo 175.8074 2394.677 0.073 0.941 -4524.552 4876.167 112 Fence__MnPrv 4573.9918 1961.358 2.332 0.020 724.167 8423.816 * 113 Fence__MnWw -4507.1574 4493.200 -1.003 0.316 -13300.000 4312.260 114 FireplaceQu_NA_indicator__Missing -418.7540 472.614 -0.886 0.376 -1346.418 508.911 115 FireplaceQu_NA_indicator__Not missing 413.9423 472.561 0.876 0.381 -513.618 
1341.502 116 FireplaceQu__Ex -6437.8767 4713.071 -1.366 0.172 -15700.000 2813.112 117 FireplaceQu__Fa 847.0908 3182.469 0.266 0.790 -5399.576 7093.758 118 FireplaceQu__Gd 1250.5352 2026.104 0.617 0.537 -2726.376 5227.447 119 FireplaceQu__Po 2884.2039 3306.821 0.872 0.383 -3606.545 9374.953 120 FireplaceQu__TA 1451.2351 2031.248 0.714 0.475 -2535.773 5438.244 121 Fireplaces -21290.0000 4621.446 -4.607 0.000 -30400.000 -12200.000 *** 122 Fireplaces_str__"0.0" -26430.0000 4645.656 -5.688 0.000 -35500.000 -17300.000 *** 123 Fireplaces_str__"1.0" -3889.3006 1411.020 -2.756 0.006 -6658.902 -1119.700 ** 124 Fireplaces_str__"2.0" 30310.0000 5025.302 6.032 0.000 20400.000 40200.000 *** 125 Foundation__BrkTil 3354.3688 4646.104 0.722 0.471 -5765.174 12500.000 126 Foundation__CBlock 3792.7502 4010.990 0.946 0.345 -4080.167 11700.000 127 Foundation__PConc 6434.8297 4207.122 1.530 0.127 -1823.064 14700.000 128 Foundation__Rare cases -8527.0468 10800.000 -0.791 0.429 -29700.000 12600.000 129 Foundation__Slab -5059.7136 9965.919 -0.508 0.612 -24600.000 14500.000 130 FullBath 7199.5122 6168.302 1.167 0.243 -4907.857 19300.000 131 FullBath_str__"1.0" -13640.0000 8756.263 -1.558 0.120 -30800.000 3545.430 132 FullBath_str__"2.0" -16610.0000 3933.742 -4.222 0.000 -24300.000 -8885.256 *** 133 FullBath_str__"3.0" 18020.0000 3815.521 4.722 0.000 10500.000 25500.000 *** 134 FullBath_str__Rare cases 12230.0000 8665.817 1.411 0.159 -4784.260 29200.000 135 Functional__Maj1 -11520.0000 8675.229 -1.328 0.185 -28500.000 5510.540 136 Functional__Min1 -1110.2593 6317.184 -0.176 0.861 -13500.000 11300.000 137 Functional__Min2 1755.2182 5888.496 0.298 0.766 -9802.937 13300.000 138 Functional__Mod -2306.9353 8771.815 -0.263 0.793 -19500.000 14900.000 139 Functional__Typ 13170.0000 3893.289 3.384 0.001 5532.793 20800.000 ** 140 GarageArea -21.1991 11.653 -1.819 0.069 -44.072 1.674 . 
141 GarageCars 21320.0000 7207.156 2.959 0.003 7175.950 35500.000 ** 142 GarageCars_str__"0.0" 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 143 GarageCars_str__"1.0" 6984.3607 7024.437 0.994 0.320 -6803.461 20800.000 144 GarageCars_str__"2.0" -5461.2707 1745.080 -3.130 0.002 -8886.577 -2035.964 ** 145 GarageCars_str__"3.0" -3978.8059 7916.515 -0.503 0.615 -19500.000 11600.000 146 GarageCond_NA_indicator__Missing 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 147 GarageCond_NA_indicator__Not missing -2455.7159 1350.984 -1.818 0.069 -5107.477 196.045 . 148 GarageCond__Fa 2161.9357 4778.123 0.452 0.651 -7216.739 11500.000 149 GarageCond__Gd -3452.5931 7309.252 -0.472 0.637 -17800.000 10900.000 150 GarageCond__Po -8412.5891 6823.765 -1.233 0.218 -21800.000 4981.346 151 GarageCond__TA 9698.4348 4136.154 2.345 0.019 1579.840 17800.000 * 152 GarageFinish_NA_indicator__Missing 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 153 GarageFinish_NA_indicator__Not missing -2455.7159 1350.984 -1.818 0.069 -5107.477 196.045 . 154 GarageFinish__Fin 1947.3572 1732.519 1.124 0.261 -1453.295 5348.010 155 GarageFinish__RFn 169.2467 1478.740 0.114 0.909 -2733.278 3071.772 156 GarageFinish__Unf -2121.4157 1819.537 -1.166 0.244 -5692.870 1450.038 157 GarageQual_NA_indicator__Missing 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 158 GarageQual_NA_indicator__Not missing -2455.7159 1350.984 -1.818 0.069 -5107.477 196.045 . 159 GarageQual__Fa -7215.6312 4739.775 -1.522 0.128 -16500.000 2087.773 160 GarageQual__Gd 8306.8742 6096.541 1.363 0.173 -3659.640 20300.000 161 GarageQual__TA -1096.0547 3643.802 -0.301 0.764 -8248.242 6056.133 162 GarageType_NA_indicator__Missing 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 163 GarageType_NA_indicator__Not missing -2455.7159 1350.984 -1.818 0.069 -5107.477 196.045 . 
164 GarageType__Attchd -3781.9543 3414.817 -1.108 0.268 -10500.000 2920.774 165 GarageType__Basment 6734.5572 7654.963 0.880 0.379 -8290.885 21800.000 166 GarageType__BuiltIn -9104.0738 4859.471 -1.873 0.061 -18600.000 434.275 . 167 GarageType__CarPort 3361.5580 9745.437 0.345 0.730 -15800.000 22500.000 168 GarageType__Detchd 2785.1012 3618.666 0.770 0.442 -4317.749 9887.952 169 GarageYrBlt -26.2058 84.117 -0.312 0.755 -191.313 138.902 170 GarageYrBlt_NA_indicator__Missing 2450.9042 1350.987 1.814 0.070 -200.863 5102.671 . 171 GarageYrBlt_NA_indicator__Not missing -2455.7159 1350.984 -1.818 0.069 -5107.477 196.045 . 172 GrLivArea 54.0179 12.266 4.404 0.000 29.941 78.094 *** 173 HalfBath -2823.3845 4441.158 -0.636 0.525 -11500.000 5893.882 174 HalfBath_str__"0.0" -1204.6132 1038.534 -1.160 0.246 -3243.085 833.859 175 HalfBath_str__"1.0" 5222.9874 4463.390 1.170 0.242 -3537.918 14000.000 176 HalfBath_str__"2.0" -4023.1860 4329.444 -0.929 0.353 -12500.000 4474.806 177 HeatingQC__Ex -1277.0599 2282.880 -0.559 0.576 -5757.980 3203.860 178 HeatingQC__Fa 3111.6972 4548.069 0.684 0.494 -5815.420 12000.000 179 HeatingQC__Gd -1511.9651 2311.765 -0.654 0.513 -6049.582 3025.652 180 HeatingQC__TA -327.4839 2085.814 -0.157 0.875 -4421.596 3766.628 181 Heating__GasA -1630.6700 5577.244 -0.292 0.770 -12600.000 9316.549 182 Heating__GasW 1183.3280 8255.308 0.143 0.886 -15000.000 17400.000 183 Heating__Rare cases 442.5303 8599.721 0.051 0.959 -16400.000 17300.000 184 HouseStyle__1.5Fin 2799.7658 12100.000 0.232 0.817 -20900.000 26500.000 185 HouseStyle__1.5Unf 14410.0000 21100.000 0.682 0.495 -27100.000 55900.000 186 HouseStyle__1Story 9975.3242 9679.832 1.031 0.303 -9024.604 29000.000 187 HouseStyle__2.5Unf -28630.0000 17900.000 -1.600 0.110 -63700.000 6489.714 188 HouseStyle__2Story -1020.5285 8835.439 -0.116 0.908 -18400.000 16300.000 189 HouseStyle__Rare cases -12580.0000 21600.000 -0.584 0.560 -54900.000 29700.000 190 HouseStyle__SFoyer 17740.0000 13300.000 1.329 0.184 
-8449.598 43900.000 191 HouseStyle__SLvl -2696.5039 15500.000 -0.174 0.862 -33200.000 27800.000 192 KitchenAbvGr -11620.0000 11100.000 -1.045 0.296 -33400.000 10200.000 193 KitchenAbvGr_str__"1.0" -1107.6733 6875.684 -0.161 0.872 -14600.000 12400.000 194 KitchenAbvGr_str__"2.0" 1102.8615 6875.539 0.160 0.873 -12400.000 14600.000 195 KitchenQual__Ex 13290.0000 4059.540 3.273 0.001 5319.672 21300.000 ** 196 KitchenQual__Fa 1866.6839 5950.301 0.314 0.754 -9812.784 13500.000 197 KitchenQual__Gd -8384.4831 2678.421 -3.130 0.002 -13600.000 -3127.180 ** 198 KitchenQual__TA -6774.8987 2672.687 -2.535 0.011 -12000.000 -1528.851 * 199 LandContour__Bnk -7450.7425 4339.721 -1.717 0.086 -16000.000 1067.421 . 200 LandContour__HLS 8469.4449 4556.821 1.859 0.063 -474.850 17400.000 . 201 LandContour__Low -3556.6689 6141.224 -0.579 0.563 -15600.000 8497.551 202 LandContour__Lvl 2533.1548 3149.840 0.804 0.422 -3649.466 8715.776 203 LandSlope__Gtl 10450.0000 6155.984 1.698 0.090 -1632.559 22500.000 . 204 LandSlope__Mod 13480.0000 5956.494 2.263 0.024 1785.724 25200.000 * 205 LandSlope__Sev -23930.0000 10600.000 -2.252 0.025 -44800.000 -3072.523 * 206 LotArea 0.5264 0.167 3.149 0.002 0.198 0.855 ** 207 LotConfig__Corner 953.6597 2331.865 0.409 0.683 -3623.410 5530.729 208 LotConfig__CulDSac 8850.0494 3077.143 2.876 0.004 2810.121 14900.000 ** 209 LotConfig__FR2 -9638.9963 3871.384 -2.490 0.013 -17200.000 -2040.102 * 210 LotConfig__Inside -169.5245 1907.299 -0.089 0.929 -3913.241 3574.192 211 LotFrontage -102.5066 64.470 -1.590 0.112 -229.050 24.037 212 LotFrontage_NA_indicator__Missing 578.6439 1315.690 0.440 0.660 -2003.840 3161.127 213 LotFrontage_NA_indicator__Not missing -583.4556 1315.708 -0.443 0.658 -3165.976 1999.064 214 LotShape__IR1 8042.0548 3412.920 2.356 0.019 1343.051 14700.000 * 215 LotShape__IR2 2345.1283 5030.765 0.466 0.641 -7529.442 12200.000 216 LotShape__IR3 -19880.0000 8730.671 -2.277 0.023 -37000.000 -2738.846 * 217 LotShape__Reg 9483.7324 3592.427 2.640 0.008 
2432.386 16500.000 ** 218 LowQualFinSF 37.8531 33.607 1.126 0.260 -28.111 103.817 219 LowQualFinSF_str__"0.0" 6051.7586 7010.819 0.863 0.388 -7709.334 19800.000 220 LowQualFinSF_str__Rare cases -6056.5703 7010.754 -0.864 0.388 -19800.000 7704.395 221 MSSubClass 371.9716 908.882 0.409 0.682 -1412.016 2155.959 222 MSSubClass_str__"120.0" -31800.0000 32000.000 -0.994 0.321 -94600.000 31000.000 223 MSSubClass_str__"160.0" -63070.0000 66600.000 -0.947 0.344 -194000.000 67700.000 224 MSSubClass_str__"180.0" -68040.0000 85200.000 -0.799 0.425 -235000.000 99200.000 225 MSSubClass_str__"190.0" -41950.0000 96800.000 -0.433 0.665 -232000.000 148000.000 226 MSSubClass_str__"20.0" 41870.0000 64100.000 0.653 0.514 -83900.000 168000.000 227 MSSubClass_str__"30.0" 32760.0000 56300.000 0.582 0.561 -77700.000 143000.000 228 MSSubClass_str__"45.0" 26740.0000 46200.000 0.579 0.563 -63900.000 117000.000 229 MSSubClass_str__"50.0" 23330.0000 37800.000 0.617 0.538 -50900.000 97600.000 230 MSSubClass_str__"60.0" 11150.0000 28200.000 0.395 0.693 -44200.000 66500.000 231 MSSubClass_str__"70.0" 20410.0000 21100.000 0.965 0.335 -21100.000 61900.000 232 MSSubClass_str__"75.0" 38020.0000 25700.000 1.477 0.140 -12500.000 88500.000 233 MSSubClass_str__"80.0" 21480.0000 18300.000 1.173 0.241 -14500.000 57400.000 234 MSSubClass_str__"85.0" -7987.7693 14700.000 -0.544 0.587 -36800.000 20800.000 235 MSSubClass_str__"90.0" -2900.9887 6260.765 -0.463 0.643 -15200.000 9387.871 236 MSZoning__C (all) -28550.0000 11900.000 -2.404 0.016 -51900.000 -5242.391 * 237 MSZoning__FV 20330.0000 7975.915 2.549 0.011 4676.916 36000.000 * 238 MSZoning__RH 3818.1883 8386.567 0.455 0.649 -12600.000 20300.000 239 MSZoning__RL 4334.1426 4438.319 0.977 0.329 -4377.552 13000.000 240 MSZoning__RM 60.6392 5026.645 0.012 0.990 -9805.843 9927.122 241 MasVnrArea 8.8044 8.307 1.060 0.290 -7.502 25.110 242 MasVnrArea_NA_indicator__Not missing 3783.0955 2906.879 1.301 0.193 -1922.632 9488.823 243 MasVnrArea_NA_indicator__Rare cases 
-3787.9072 2906.902 -1.303 0.193 -9493.681 1917.866 244 MasVnrType_NA_indicator__Not missing 3783.0955 2906.879 1.301 0.193 -1922.632 9488.823 245 MasVnrType_NA_indicator__Rare cases -3787.9072 2906.902 -1.303 0.193 -9493.681 1917.866 246 MasVnrType__BrkCmn -15160.0000 7227.306 -2.097 0.036 -29300.000 -969.060 * 247 MasVnrType__BrkFace 2572.3989 2916.086 0.882 0.378 -3151.402 8296.200 248 MasVnrType__None 5567.4248 3104.407 1.793 0.073 -526.020 11700.000 . 249 MasVnrType__Stone 7010.4457 3624.076 1.934 0.053 -103.024 14100.000 . 250 MiscFeature_NA_indicator__Missing -1342.9831 11200.000 -0.120 0.905 -23300.000 20600.000 251 MiscFeature_NA_indicator__Not missing 1338.1714 11200.000 0.119 0.905 -20600.000 23300.000 252 MiscFeature__Rare cases 193.2522 12500.000 0.015 0.988 -24400.000 24700.000 253 MiscFeature__Shed 4382.1547 6879.837 0.637 0.524 -9121.841 17900.000 254 MiscFeature__TenC -4580.2187 8242.321 -0.556 0.579 -20800.000 11600.000 255 MiscVal 4.5326 5.027 0.902 0.368 -5.335 14.400 256 MiscVal_str__"0.0" 6867.6642 15700.000 0.436 0.663 -24000.000 37800.000 257 MiscVal_str__"400.0" -2507.4066 10400.000 -0.241 0.809 -22900.000 17900.000 258 MiscVal_str__Rare cases -4365.0694 9388.219 -0.465 0.642 -22800.000 14100.000 259 MoSold -40.2678 343.999 -0.117 0.907 -715.481 634.946 260 MoSold_str__"1.0" -356.6960 3556.012 -0.100 0.920 -7336.566 6623.174 261 MoSold_str__"10.0" -4000.7022 3276.907 -1.221 0.222 -10400.000 2431.331 262 MoSold_str__"11.0" 5188.0677 3348.288 1.549 0.122 -1384.075 11800.000 263 MoSold_str__"12.0" -341.1583 3404.138 -0.100 0.920 -7022.926 6340.609 264 MoSold_str__"2.0" -2252.7998 3723.738 -0.605 0.545 -9561.890 5056.290 265 MoSold_str__"3.0" 227.8194 3089.357 0.074 0.941 -5836.084 6291.723 266 MoSold_str__"4.0" 228.5182 2864.891 0.080 0.936 -5394.795 5851.831 267 MoSold_str__"5.0" 3079.9956 2488.910 1.237 0.216 -1805.328 7965.320 268 MoSold_str__"6.0" 1490.7548 2188.979 0.681 0.496 -2805.854 5787.363 269 MoSold_str__"7.0" 3834.7253 2211.987 
1.734 0.083 -507.044 8176.495 . 270 MoSold_str__"8.0" -2999.1183 2984.160 -1.005 0.315 -8856.537 2858.300 271 MoSold_str__"9.0" -4104.2182 3905.217 -1.051 0.294 -11800.000 3561.084 272 Neighborhood__Blmngtn -2405.0816 9971.748 -0.241 0.809 -22000.000 17200.000 273 Neighborhood__BrDale 9851.3805 11200.000 0.877 0.381 -12200.000 31900.000 274 Neighborhood__BrkSide -2760.2576 6654.659 -0.415 0.678 -15800.000 10300.000 275 Neighborhood__ClearCr 5860.3200 7524.168 0.779 0.436 -8908.393 20600.000 276 Neighborhood__CollgCr 429.6268 3933.180 0.109 0.913 -7290.562 8149.816 277 Neighborhood__Crawfor 8207.5569 6146.625 1.335 0.182 -3857.265 20300.000 278 Neighborhood__Edwards -20340.0000 4167.733 -4.881 0.000 -28500.000 -12200.000 *** 279 Neighborhood__Gilbert -1724.1870 5054.816 -0.341 0.733 -11600.000 8197.591 280 Neighborhood__IDOTRR -10990.0000 8823.758 -1.245 0.213 -28300.000 6333.598 281 Neighborhood__MeadowV -3435.2006 12700.000 -0.271 0.786 -28300.000 21400.000 282 Neighborhood__Mitchel -10220.0000 5083.734 -2.010 0.045 -20200.000 -240.406 * 283 Neighborhood__NAmes -9022.0852 3677.284 -2.453 0.014 -16200.000 -1804.177 * 284 Neighborhood__NWAmes -15400.0000 5647.824 -2.727 0.007 -26500.000 -4315.506 ** 285 Neighborhood__NoRidge 33900.0000 6727.434 5.039 0.000 20700.000 47100.000 *** 286 Neighborhood__NridgHt 23540.0000 5696.181 4.133 0.000 12400.000 34700.000 *** 287 Neighborhood__OldTown -13710.0000 6833.774 -2.006 0.045 -27100.000 -291.589 * 288 Neighborhood__Rare cases 6678.2466 11500.000 0.579 0.562 -15900.000 29300.000 289 Neighborhood__SWISU -24000.0000 8792.284 -2.730 0.006 -41300.000 -6745.482 ** 290 Neighborhood__Sawyer -5879.9162 4775.703 -1.231 0.219 -15300.000 3494.008 291 Neighborhood__SawyerW 1433.9447 4828.371 0.297 0.767 -8043.360 10900.000 292 Neighborhood__Somerst -1250.2866 7918.644 -0.158 0.875 -16800.000 14300.000 293 Neighborhood__StoneBr 34010.0000 7888.393 4.311 0.000 18500.000 49500.000 *** 294 Neighborhood__Timber -11840.0000 6608.489 -1.792 
0.073 -24800.000 1128.047 . 295 Neighborhood__Veenker 9057.7049 9525.296 0.951 0.342 -9638.896 27800.000 296 OpenPorchSF 23.5879 15.876 1.486 0.138 -7.574 54.750 297 OverallCond 1754.3417 6936.585 0.253 0.800 -11900.000 15400.000 298 OverallCond_str__"3.0" -15170.0000 21600.000 -0.704 0.482 -57500.000 27100.000 299 OverallCond_str__"4.0" -7764.8147 14800.000 -0.524 0.600 -36800.000 21300.000 300 OverallCond_str__"5.0" -5589.7146 7867.117 -0.711 0.478 -21000.000 9852.151 301 OverallCond_str__"6.0" 2043.1315 2814.799 0.726 0.468 -3481.860 7568.123 302 OverallCond_str__"7.0" 7823.9711 7622.835 1.026 0.305 -7138.408 22800.000 303 OverallCond_str__"8.0" 9548.4183 14600.000 0.655 0.513 -19100.000 38200.000 304 OverallCond_str__"9.0" 9109.0698 22000.000 0.415 0.678 -34000.000 52200.000 305 OverallQual 1983.6366 6791.723 0.292 0.770 -11300.000 15300.000 306 OverallQual_str__"10.0" 55500.0000 24800.000 2.233 0.026 6721.551 104000.000 * 307 OverallQual_str__"3.0" -17780.0000 24100.000 -0.739 0.460 -65000.000 29500.000 308 OverallQual_str__"4.0" -14290.0000 16800.000 -0.849 0.396 -47300.000 18800.000 309 OverallQual_str__"5.0" -20170.0000 10900.000 -1.854 0.064 -41500.000 1183.551 . 310 OverallQual_str__"6.0" -19490.0000 4815.120 -4.048 0.000 -28900.000 -10000.000 *** 311 OverallQual_str__"7.0" -14130.0000 4552.060 -3.104 0.002 -23100.000 -5196.601 ** 312 OverallQual_str__"8.0" -1085.6766 10400.000 -0.104 0.917 -21500.000 19300.000 313 OverallQual_str__"9.0" 31450.0000 17600.000 1.782 0.075 -3185.175 66100.000 . 314 PavedDrive__N 3136.1507 3477.582 0.902 0.367 -3689.775 9962.077 315 PavedDrive__P -3798.6032 4199.782 -0.904 0.366 -12000.000 4444.883 316 PavedDrive__Y 657.6408 2779.464 0.237 0.813 -4797.993 6113.274 317 PoolArea -8.1456 26.330 -0.309 0.757 -59.827 43.536 318 PoolQC__Ex -3243.4115 3222.878 -1.006 0.315 -9569.395 3082.572 319 PoolQC__Fa 8739.4732 5773.192 1.514 0.130 -2592.359 20100.000 320 PoolQC__Gd -5500.8734 3223.896 -1.706 0.088 -11800.000 827.108 . 
321 RoofMatl__CompShg 8915.2422 7608.402 1.172 0.242 -6018.808 23800.000 322 RoofMatl__Rare cases 13930.0000 7889.193 1.766 0.078 -1555.656 29400.000 . 323 RoofMatl__Tar&Grv -22850.0000 12000.000 -1.902 0.058 -46400.000 729.984 . 324 RoofStyle__Flat 11000.0000 15600.000 0.706 0.480 -19600.000 41600.000 325 RoofStyle__Gable -8087.0594 5292.277 -1.528 0.127 -18500.000 2300.817 326 RoofStyle__Gambrel -3055.7320 9425.583 -0.324 0.746 -21600.000 15400.000 327 RoofStyle__Hip -7536.8044 5552.231 -1.357 0.175 -18400.000 3361.318 328 RoofStyle__Rare cases 7675.3582 10800.000 0.711 0.477 -13500.000 28900.000 329 SaleCondition__Abnorml -4760.7094 5480.135 -0.869 0.385 -15500.000 5995.900 330 SaleCondition__Alloca 550.5266 11200.000 0.049 0.961 -21500.000 22600.000 331 SaleCondition__Family 1800.9023 7119.335 0.253 0.800 -12200.000 15800.000 332 SaleCondition__Normal 1920.4209 4757.179 0.404 0.687 -7417.144 11300.000 333 SaleCondition__Partial 484.0479 14000.000 0.035 0.972 -26900.000 27900.000 334 SaleType__COD -4057.7432 6637.617 -0.611 0.541 -17100.000 8970.816 335 SaleType__New 6082.6969 13100.000 0.463 0.643 -19700.000 31900.000 336 SaleType__Rare cases 6229.5543 6591.435 0.945 0.345 -6708.355 19200.000 337 SaleType__WD -8259.3197 5191.945 -1.591 0.112 -18500.000 1931.621 338 ScreenPorch 65.3563 16.306 4.008 0.000 33.350 97.362 *** 339 TotRmsAbvGrd 1442.9043 5429.991 0.266 0.791 -9215.281 12100.000 340 TotRmsAbvGrd_str__"10.0" 8582.9861 13700.000 0.627 0.531 -18300.000 35500.000 341 TotRmsAbvGrd_str__"11.0" 11050.0000 19300.000 0.573 0.567 -26800.000 48900.000 342 TotRmsAbvGrd_str__"12.0" -39060.0000 26400.000 -1.477 0.140 -91000.000 12800.000 343 TotRmsAbvGrd_str__"3.0" -3255.0682 24800.000 -0.131 0.896 -52000.000 45500.000 344 TotRmsAbvGrd_str__"4.0" 2314.5620 18800.000 0.123 0.902 -34500.000 39100.000 345 TotRmsAbvGrd_str__"5.0" 3842.8260 13500.000 0.285 0.776 -22600.000 30300.000 346 TotRmsAbvGrd_str__"6.0" 6572.5656 8329.981 0.789 0.430 -9777.825 22900.000 347 
TotRmsAbvGrd_str__"7.0" 5729.3691 3948.586 1.451 0.147 -2021.060 13500.000 348 TotRmsAbvGrd_str__"8.0" 2298.1803 4358.857 0.527 0.598 -6257.544 10900.000 349 TotRmsAbvGrd_str__"9.0" 1920.7105 8970.486 0.214 0.831 -15700.000 19500.000 350 TotalBsmtSF 4.6293 6.069 0.763 0.446 -7.283 16.542 351 WoodDeckSF 7.1014 7.973 0.891 0.373 -8.548 22.751 352 YearBuilt 377.6051 122.994 3.070 0.002 136.188 619.022 ** 353 YearRemodAdd 79.4149 75.849 1.047 0.295 -69.464 228.293 354 YrSold -428.1827 151.320 -2.830 0.005 -725.199 -131.166 ** 355 YrSold_str__"2006.0" -947.4036 1826.616 -0.519 0.604 -4532.753 2637.946 356 YrSold_str__"2007.0" -1712.1666 1737.426 -0.985 0.325 -5122.451 1698.117 357 YrSold_str__"2008.0" -154.2332 1744.307 -0.088 0.930 -3578.022 3269.556 358 YrSold_str__"2009.0" -8.8170 1736.092 -0.005 0.996 -3416.482 3398.848 359 YrSold_str__"2010.0" 2817.8087 2282.426 1.235 0.217 -1662.221 7297.838 ------------------------- --- Model statistic --- R-squared : 0.925 Adj. R-squared : 0.901 F-statistic : 38 Prob (F-statistic): 0.0 No. Observations : 1095 AIC : 25536 Df Residuals : 827 BIC : 26876 RMSE (test) : 32690 ------------------------- Maximum correlation between Reseduals and any data columns is 6.521258747279969e-13, with columns <LotArea> Mean of train reseduals: 2.850851758498035e-08 ------------------------------------- Random Forest ------------------------------------- ------------------------- RF model peramters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': True, 'random_state': None, 'verbose': 0, 'warm_start': False} -------------------------
-------------------------
--- Model statistic ---
R^2 (test) : 0.8402415243284254
R^2 (train): 0.9809782507599958
RMSE (test): 30828
oob score : 0.859404130508701
-------------------------
Maximum correlation between Reseduals and any data columns is 0.3735543313645223, with columns <TotRmsAbvGrd_str__"11.0">